# Crawler practice
# Blog:
# Author: zfh
# Email: 965412472@qq.com

from scrapy_web.htmlDownloader import HtmlDownloader
from scrapy_web.htmlPrase import HtmlPrase
from scrapy_web.urlManger import UrlManget
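
# The three collaborators imported above form a classic crawler pipeline.
# Their implementations are not shown here; the interfaces below are
# inferred from how SpiderMan uses them:
#   - UrlManget tracks pending vs. already-crawled URLs
#     (add_new_url, add_new_urls, has_new_url, get_new_url, old_url_size)
#   - HtmlDownloader fetches a page's raw HTML (download)
#   - HtmlPrase extracts outgoing links and page data (parser)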


class SpiderMan(object):

    def __init__(self):
        self.manager = UrlManget()
        self.downloader = HtmlDownloader()
        self.parser = HtmlPrase()

    def crawl(self, root_url):
        # Seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        # Keep crawling while unvisited URLs remain, capped at 100 pages
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Fetch the next unvisited URL from the URL manager
                new_url = self.manager.get_new_url()
                print(new_url)
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Parse the page for outgoing links and data; pass the page
                # URL so relative links resolve against the right base
                new_urls, data = self.parser.parser(new_url, html)
                # Feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Store the extracted data (left as a stub in the original)
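                # A hypothetical minimal sketch of that storage step, for
                # illustration only: append each parsed record as one JSON
                # line. The output filename is an assumption.
                import json  # local import so the sketch stays self-contained
                with open("baike_data.jsonl", "a", encoding="utf-8") as out:
                    out.write(json.dumps(data, ensure_ascii=False) + "\n")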
                print("已经抓取%s"%self.Manger.old_url_size())
            except:
                print('获取连接失败')
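
# A minimal, hypothetical URL manager with the interface that
# SpiderMan.crawl() relies on, inferred from the calls above; the real
# implementation lives in scrapy_web.urlManger and may differ. Shown here
# only to illustrate the two-set deduplication scheme such managers use.
class ExampleUrlManager(object):

    def __init__(self):
        self.new_urls = set()  # URLs still waiting to be crawled
        self.old_urls = set()  # URLs that have already been crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        # Ignore duplicates and URLs that were already crawled
        if url is not None and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls:
            for url in urls:
                self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)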


if __name__ == '__main__':
    spiderman = SpiderMan()
    spiderman.crawl("https://baike.baidu.com/item/%E4%BF%A1%E7%94%A8%E5%8D%A1")